While examining the NYPD crime complaint data, we expect crime rate to be indicative of the socioeconomic status of a specific neighborhood. In this section, we calculated the crime rate for each neighborhood in each year, and displayed the neighborhoods with the highest crime rates, along with what their crime composition looks like. We are also interested in whether crime rate can be predictive of rent in the corresponding neighborhoods.
# Import and clean the NYPD complaint data.
# Values coded as "(null)" in the raw file are read in as NA.
nypd_url = "https://www.dropbox.com/scl/fi/1j6emjrdo3zbjrt5ehgq0/NYPD_Complaint_Data_Historic.csv?rlkey=co02d6vsgxr0aj44r7uhnbz64&dl=1"
df_nypd = nypd_url |>
  read_csv(na = "(null)") |>
  janitor::clean_names()
## Siqing' note: I'm currently reading from my local file. Yuki, when you run
## this file please comment out the chunk below and use the dropbox. Thanks!!
# df_nypd = read_csv("../NYPD_Complaint_Data_Historic.csv") |>
#   janitor::clean_names()
# Precinct-to-neighborhood lookup table
prec_neighbor = read_csv("data/nyc_prec_neighborhood.csv")
# Merge neighborhood information into the NYPD data, keeping only the
# variables used downstream.
nypd_ses_df = df_nypd |>
  select(
    cmplnt_num, cmplnt_fr_dt, addr_pct_cd, crm_atpt_cptd_cd, law_cat_cd,
    susp_age_group, susp_race, susp_sex, vic_age_group, vic_race, vic_sex,
    ofns_desc, pd_desc, latitude, longitude
  ) |>
  rename(precinct = addr_pct_cd) |>
  mutate(
    cmplnt_fr_dt = as.Date(cmplnt_fr_dt, format = "%m/%d/%Y"),
    # complaint year as a character string, e.g. "2021"
    year = format(cmplnt_fr_dt, "%Y")
  ) |>
  left_join(prec_neighbor, by = "precinct")
# Import and clean the socio-economic data. neighbor_ses holds SES indicators
# (except rent) for each sub-borough neighborhood, years 2017-2022.
neighbor_ses = readxl::read_excel("data/neighorhood_indicators.xlsx", sheet = "Data") |>
  janitor::clean_names() |>
  filter(region_type == "Sub-Borough Area", year %in% 2017:2022) |>
  rename(neighborhood = region_name) |>
  select(
    neighborhood, year, pop_num, hh_inc_med_adj, pop16_unemp_pct,
    pop_edu_collp_pct, pop_edu_nohs_pct, pop_pov_pct, pop_race_asian_pct,
    pop_race_black_pct, pop_race_hisp_pct, pop_race_white_pct, pop_foreign_pct
  )
# neighbor_rent holds gross rent for each neighborhood, taken from the rows
# reported for the pooled "2017-2021" period.
neighbor_rent = readxl::read_excel("data/neighorhood_indicators.xlsx", sheet = "Data") |>
  janitor::clean_names() |>
  filter(region_type == "Sub-Borough Area", year == "2017-2021") |>
  rename(neighborhood = region_name) |>
  select(neighborhood, gross_rent_0_1beds, gross_rent_2_3beds)
# ses_df: crime counts, crime rate, and SES indicators per neighborhood-year.
ses_df = nypd_ses_df |>
  group_by(year, borough, neighborhood) |>
  summarise(crime_num = n(), .groups = "drop")
# Inner-join (merge) with the SES table, then attach rent data.
# NOTE(review): nypd `year` is character while the SES `year` came from Excel —
# merge() matches after coercion, but confirm both sides line up.
ses_df = ses_df |>
  merge(neighbor_ses, by = c("year", "neighborhood")) |>
  # FIX: was `* 100,000`, which mutate() parsed as `* 100` plus a stray
  # unnamed argument `000` — crime_rate was off by a factor of 1,000 and a
  # junk column was silently added. Crime rate is per 100,000 residents.
  mutate(crime_rate = (crime_num / pop_num) * 100000) |>
  left_join(neighbor_rent, by = "neighborhood")
# NYC precinct borders (polygon layer used for mapping)
nyc = here::here("data", "Police_Precincts.geojson") |>
  read_sf() |>
  select(-shape_area, -shape_leng) |>
  # coerce precinct to numeric so it joins against the complaint data's
  # precinct column — assumes the geojson stores it as text; confirm
  mutate(precinct = as.double(precinct))
# Prepare data set for mapping:
# population by precinct (2020 census; P1_001N = total population)
df_pop = here::here("data", "nyc_precinct_2020pop.csv") |>
  read_csv() |>
  rename(pop = P1_001N) |>
  select(precinct, pop)
# Select SES variables for the 2021 heat maps, converting the proportion
# columns to percentages.
ses_heat = neighbor_ses |>
  filter(year == 2021) |> # visualize 2021 data
  mutate(
    across(
      c(pop16_unemp_pct, pop_edu_collp_pct, pop_pov_pct),
      \(p) p * 100
    )
  ) |>
  select(
    neighborhood, pop_num, hh_inc_med_adj, pop16_unemp_pct,
    pop_edu_collp_pct, pop_pov_pct
  )
# Merge the 2021 complaints with the SES data and compute per-precinct crime
# indicators; the result stays an sf point object so it can be mapped.
nypd_ses_heat = nypd_ses_df |>
filter(year == 2021) |>
left_join(ses_heat, by = "neighborhood") |> # combine with ses data
st_as_sf(
# which columns to use as coordinates
coords = c("longitude", "latitude"),
# keep the coordinate columns
remove = FALSE,
# projection system: EPSG 4326 (WGS 84 lon/lat)
crs = 4326
) |>
merge(df_pop, by = "precinct") |>
group_by(precinct) |>
mutate(
# NOTE(review): "crime_rate" here counts only FELONY ASSAULT complaints,
# not all crime — confirm this is the intended definition (or rename).
"crime_rate" = (sum(ofns_desc == "FELONY ASSAULT") / pop) * 100000, # calculate crime rate per 100,000 using 2020 population census data by precinct
"violation" = (sum(law_cat_cd == "VIOLATION") / n()) * 100, # the rate of violation among all complaints
"misdemeanor" = (sum(law_cat_cd == "MISDEMEANOR") / n()) * 100, # the rate of misdemeanor among all complaints
"felony" = (sum(law_cat_cd == "FELONY") / n()) * 100 # the rate of felony among all complaints
) |>
select(precinct, neighborhood, hh_inc_med_adj, pop16_unemp_pct, pop_edu_collp_pct, pop_pov_pct, geometry, crime_rate, felony, misdemeanor, violation)
# Spatially join each complaint point to the precinct polygon it falls in.
df_spj = nypd_ses_heat |>
  st_join(
    # only the precinct id and geometry are needed from the polygon layer
    nyc |> select(precinct, geometry),
    # keep rows where a complaint point intersects a precinct polygon
    join = st_intersects,
    # inner join: drop points that match no polygon
    left = FALSE
  ) |>
  # keep the precinct recorded in the complaint data; drop the polygon's copy
  select(-precinct.y) |>
  rename(precinct = precinct.x)
# Collapse to one row per precinct and attach the precinct polygons so the
# indicators can be drawn as choropleths.
count_by_neighborhood = df_spj |>
  # remove geometry for fast counting
  st_drop_geometry() |>
  distinct() |>
  # join the indicators onto the nyc polygon object
  right_join(nyc, by = "precinct") |>
  # hover label, e.g. "14 - Midtown"
  mutate(pre_nei = paste(precinct, "-", neighborhood)) |>
  st_as_sf() |>
  drop_na() # drop central park due to the lack of data
# tmap: interactive choropleths of SES indicators with crime-rate bubbles.
tmap_mode("view") # set tmap to the view mode
# Crime-rate-only choropleth, kept for reference:
# tm_shape(count_by_neighborhood) +
# tm_polygons(col = "crime_rate", id = "pre_nei", breaks=c(0, 50, 100, 300, 600, 1000, 1600), title = "Crime rate per 100,000")
# Median household income, with bubble size = crime rate per 100,000
tm_shape(count_by_neighborhood) +
tm_polygons(col = "hh_inc_med_adj", id = "pre_nei", title = "Median household’s income (USD/yr)", palette = "Blues") +
tm_bubbles("crime_rate", id = "pre_nei", col = "azure", alpha = .5)
# Unemployment rate, with bubble size = crime rate per 100,000
tm_shape(count_by_neighborhood) +
tm_polygons(col = "pop16_unemp_pct", id = "pre_nei", title = "Unemployment rate (%)") +
tm_bubbles("crime_rate", id = "pre_nei", col = "azure", alpha = .5)
# Share of college graduates, with bubble size = crime rate per 100,000
tm_shape(count_by_neighborhood) +
tm_polygons(col = "pop_edu_collp_pct", id = "pre_nei", title = "College graduates (%)", palette = "Blues") +
tm_bubbles("crime_rate", id = "pre_nei", col = "azure", alpha = .5)